Reg. No: 18BCE1227

Name: C Koushik


About the dataset

The dataset package provided consists of 6 files containing the details of donations, donors, schools, projects, teachers and resources.

Source: Kaggle


Background and Objective

DonorsChoose.org is a US based non-profit organisation founded in the year 2000, that helps public school classroom projects get funded directly by helpful donors. Through charity, they make it easy for anyone to help a classroom in need. The organisation wants to inspire active donors to donate again towards projects they feel strong about. In order to do that, the organisation wants to pair the donors with related classrooms based on their previous donations and interests so that they are motivated to donate again.

Sometimes there are chances that some projects may not come into light but are desperate for donations. Therefore, we build a recommendation system that recommends categories of projects to teachers across based on previous donations or interests through RFM (Recency, Frequency, Monetory value) clustering analysis to group similar types of donors mapped with the respective project category through recommendation.


Plans for Review

Review 1: Defining problem statement and reading the dataset followed by basic commands.
Review 2: Exploratory data analysis.
Review 3: To perform clustering based on RFM analysis and recommend project categories.

How does this analysis help?

Before making recommendations, it would be great to know the right set of donors based on their recent activities, how they actively involve themselves or stay less connected.

Helps find the right set of projects for first time donors. Since we’re not usually aware of their preferences yet, its difficult to comprehend their interests right away, leading to a cold start problem. So using the clustering technique can solve such problems.


Review 1


Setup

rm(list = ls())
#Importing libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(gganimate)
## Warning: package 'gganimate' was built under R version 4.0.5
library(IRdisplay)
## Warning: package 'IRdisplay' was built under R version 4.0.5
library(viridis)
## Warning: package 'viridis' was built under R version 4.0.5
## Loading required package: viridisLite
## Warning: package 'viridisLite' was built under R version 4.0.5
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.0.4
library(stringr)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.0.4
## Loading required package: RColorBrewer
library(clValid)
## Warning: package 'clValid' was built under R version 4.0.5
## Loading required package: cluster

Basic commands

df_donations=read.csv('Donations.csv')
df_donors=read.csv('Donors.csv')
df_projects=read.csv('Projects.csv')
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## EOF within quoted string
df_resources=read.csv('Resources.csv')
df_schools=read.csv('Schools.csv')
df_teachers=read.csv('Teachers.csv')
str(df_donations)
## 'data.frame':    4687884 obs. of  7 variables:
##  $ Project.ID                         : chr  "000009891526c0ade7180f8423792063" "000009891526c0ade7180f8423792063" "000009891526c0ade7180f8423792063" "000009891526c0ade7180f8423792063" ...
##  $ Donation.ID                        : chr  "688729120858666221208529ee3fc18e" "dcf1071da3aa3561f91ac689d1f73dee" "18a234b9d1e538c431761d521ea7799d" "38d2744bf9138b0b57ed581c76c0e2da" ...
##  $ Donor.ID                           : chr  "1f4b5b6e68445c6c4a0509b3aca93f38" "4aaab6d244bf3599682239ed5591af8a" "0b0765dc9c759adc48a07688ba25e94e" "377944ad61f72d800b25ec1862aec363" ...
##  $ Donation.Included.Optional.Donation: chr  "No" "Yes" "Yes" "Yes" ...
##  $ Donation.Amount                    : num  178 25 20 25 25 ...
##  $ Donor.Cart.Sequence                : int  11 2 3 1 2 1 1 2 2 44 ...
##  $ Donation.Received.Date             : chr  "2016-08-23 13:15:57" "2016-06-06 20:05:23" "2016-06-06 14:08:46" "2016-05-15 10:23:04" ...
str(df_donors)
## 'data.frame':    2122640 obs. of  5 variables:
##  $ Donor.ID        : chr  "00000ce845c00cbf0686c992fc369df4" "00002783bc5d108510f3f9666c8b1edd" "00002d44003ed46b066607c5455a999a" "00002eb25d60a09c318efbd0797bffb5" ...
##  $ Donor.City      : chr  "Evanston" "Appomattox" "Winton" "Indianapolis" ...
##  $ Donor.State     : chr  "Illinois" "other" "California" "Indiana" ...
##  $ Donor.Is.Teacher: chr  "No" "No" "Yes" "No" ...
##  $ Donor.Zip       : chr  "602" "245" "953" "462" ...
str(df_projects)
## 'data.frame':    34919 obs. of  18 variables:
##  $ Project.ID                      : chr  "7685f0265a19d7b52a470ee4bac883ba" "f9f4af7099061fb4bf44642a03e5c331" "afd99a01739ad5557b51b1ba0174e832" "c614a38bb1a5e68e2ae6ad9d94bb2492" ...
##  $ School.ID                       : chr  "e180c7424cb9c68cb49f141b092a988f" "08b20f1e2125103ed7aa17e8d76c71d4" "1287f5128b1f36bf8434e5705a7cc04d" "900fec9cd7a3188acbc90586a09584ef" ...
##  $ Teacher.ID                      : chr  "4ee5200e89d9e2998ec8baad8a3c5968" "cca2d1d277fb4adb50147b49cdc3b156" "6c5bd0d4f20547a001628aefd71de89e" "8ed6f8181d092a8f4c008b18d18e54ad" ...
##  $ Teacher.Project.Posted.Sequence : int  25 3 1 40 2 4 3 57 14 1 ...
##  $ Project.Type                    : chr  "Teacher-Led" "Teacher-Led" "Teacher-Led" "Teacher-Led" ...
##  $ Project.Title                   : chr  "Stand Up to Bullying: Together We Can!" "Learning in Color!" "Help Second Grade ESL Students Develop Language to Speak and Read" "Help Bilingual Students Strengthen Reading Comprehension" ...
##  $ Project.Essay                   : chr  "Did you know that 1-7 students in grades K-12 is either a bully or a victim of bullying? My goal is to raise pr"| __truncated__ "Help us have a fun, interactive listening center in our class! Did you struggle to read when you were younger? "| __truncated__ "Visiting or moving to a new place can be very exciting yet frightening at the same time especially if you are u"| __truncated__ "Students at our school are still working hard to become proficient in English. A great series like The Sisters "| __truncated__ ...
##  $ Project.Short.Description       : chr  "Did you know that 1-7 students in grades K-12 is either a bully or a victim of bullying? My goal is to raise pr"| __truncated__ "Help us have a fun, interactive listening center in our class! Did you struggle to read when you were younger? "| __truncated__ "Visiting or moving to a new place can be very exciting yet frightening at the same time especially if you are u"| __truncated__ "Students at our school are still working hard to become proficient in English. A great series like The Sisters "| __truncated__ ...
##  $ Project.Need.Statement          : chr  "My students need 25 copies of \"Bullying in Schools\" for each to keep, \"Stand Up For Yourself and Your Friend"| __truncated__ "My students need a listening center, read along books on CD, and headphones for the computers." "My students need beginning vocabulary audio cards and a CD player so they can be able to see and hear words in "| __truncated__ "My students need one copy of each book in The Sisters Grimm series to support literacy growth." ...
##  $ Project.Subject.Category.Tree   : chr  "Applied Learning" "Applied Learning, Literacy & Language" "Literacy & Language" "Literacy & Language" ...
##  $ Project.Subject.Subcategory.Tree: chr  "Character Education, Early Development" "Early Development, Literacy" "ESL" "ESL, Literacy" ...
##  $ Project.Grade.Level.Category    : chr  "Grades PreK-2" "Grades PreK-2" "Grades PreK-2" "Grades 3-5" ...
##  $ Project.Resource.Category       : chr  "Technology" "Technology" "Supplies" "Books" ...
##  $ Project.Cost                    : num  362 513 436 161 264 ...
##  $ Project.Posted.Date             : chr  "2013-01-01" "2013-01-01" "2013-01-01" "2013-01-01" ...
##  $ Project.Expiration.Date         : chr  "2013-05-30" "2013-05-31" "2013-05-30" "2013-05-31" ...
##  $ Project.Current.Status          : chr  "Fully Funded" "Expired" "Fully Funded" "Fully Funded" ...
##  $ Project.Fully.Funded.Date       : chr  "2013-01-11" "" "2013-05-22" "2013-02-06" ...
str(df_resources)
## 'data.frame':    7210448 obs. of  5 variables:
##  $ Project.ID          : chr  "000009891526c0ade7180f8423792063" "00000ce845c00cbf0686c992fc369df4" "00002d44003ed46b066607c5455a999a" "00002d44003ed46b066607c5455a999a" ...
##  $ Resource.Item.Name  : chr  "chair move and store cart" "sony mdr zx100 blk   headphones" "gaiam kids stay-n-play balance ball, grey" "cf520x - giant comfy pillows - set of 4" ...
##  $ Resource.Quantity   : num  1 40 4 1 1 2 3 1 1 1 ...
##  $ Resource.Unit.Price : num  350 12.9 19 269 131.8 ...
##  $ Resource.Vendor.Name: chr  "" "CDW-G" "Amazon Business" "Lakeshore Learning Materials" ...
str(df_schools)
## 'data.frame':    72993 obs. of  9 variables:
##  $ School.ID                   : chr  "00003e0fdd601b8ea0a6eb44057b9c5e" "00004e32a448b4832e1b993500bf0731" "0002021bb799f28de224f1acc1ff08c4" "0004604f675212a8cac1161338265196" ...
##  $ School.Name                 : chr  "Capon Bridge Middle School" "The Woodlands College Park High School" "Samantha Smith Elementary School" "Kingsbury Country Day School" ...
##  $ School.Metro.Type           : chr  "rural" "urban" "suburban" "unknown" ...
##  $ School.Percentage.Free.Lunch: int  56 41 2 76 50 63 17 15 46 29 ...
##  $ School.State                : chr  "West Virginia" "Texas" "Washington" "Michigan" ...
##  $ School.Zip                  : int  26711 77384 98074 48370 75573 85706 10029 29045 95122 60025 ...
##  $ School.City                 : chr  "Capon Bridge" "The Woodlands" "Sammamish" "Oxford" ...
##  $ School.County               : chr  "Hampshire" "Montgomery" "King" "Oakland" ...
##  $ School.District             : chr  "Hampshire Co School District" "Conroe Ind School District" "Lake Washington Sch Dist 414" "Michigan Dept Of Education" ...
str(df_teachers)
## 'data.frame':    402900 obs. of  3 variables:
##  $ Teacher.ID                       : chr  "00000f7264c27ba6fea0c837ed6aa0aa" "00002d44003ed46b066607c5455a999a" "00006084c3d92d904a22e0a70f5c119a" "0000a9af8b6b9cc9e41f53322a8b8cf1" ...
##  $ Teacher.Prefix                   : chr  "Mrs." "Mrs." "Mr." "Ms." ...
##  $ Teacher.First.Project.Posted.Date: chr  "2013-08-21" "2016-10-23" "2016-09-08" "2015-10-25" ...
head(df_donations)
##                         Project.ID                      Donation.ID
## 1 000009891526c0ade7180f8423792063 688729120858666221208529ee3fc18e
## 2 000009891526c0ade7180f8423792063 dcf1071da3aa3561f91ac689d1f73dee
## 3 000009891526c0ade7180f8423792063 18a234b9d1e538c431761d521ea7799d
## 4 000009891526c0ade7180f8423792063 38d2744bf9138b0b57ed581c76c0e2da
## 5 000009891526c0ade7180f8423792063 5a032791e31167a70206bfb86fb60035
## 6 000009891526c0ade7180f8423792063 8cea27f0cc03f41f66aab96b284ae6a1
##                           Donor.ID Donation.Included.Optional.Donation
## 1 1f4b5b6e68445c6c4a0509b3aca93f38                                  No
## 2 4aaab6d244bf3599682239ed5591af8a                                 Yes
## 3 0b0765dc9c759adc48a07688ba25e94e                                 Yes
## 4 377944ad61f72d800b25ec1862aec363                                 Yes
## 5 6d5b22d39e68c656071a842732c63a0c                                 Yes
## 6 896c75c9b8d9a91c759746e566cd3f37                                 Yes
##   Donation.Amount Donor.Cart.Sequence Donation.Received.Date
## 1          178.37                  11    2016-08-23 13:15:57
## 2           25.00                   2    2016-06-06 20:05:23
## 3           20.00                   3    2016-06-06 14:08:46
## 4           25.00                   1    2016-05-15 10:23:04
## 5           25.00                   2    2016-05-17 01:23:38
## 6           15.00                   1    2016-06-04 17:58:55
head(df_donors)
##                           Donor.ID   Donor.City Donor.State Donor.Is.Teacher
## 1 00000ce845c00cbf0686c992fc369df4     Evanston    Illinois               No
## 2 00002783bc5d108510f3f9666c8b1edd   Appomattox       other               No
## 3 00002d44003ed46b066607c5455a999a       Winton  California              Yes
## 4 00002eb25d60a09c318efbd0797bffb5 Indianapolis     Indiana               No
## 5 0000300773fe015f870914b42528541b     Paterson  New Jersey               No
## 6 00004c31ce07c22148ee37acd0f814b9                    other               No
##   Donor.Zip
## 1       602
## 2       245
## 3       953
## 4       462
## 5       075
## 6
head(df_projects)
##                         Project.ID                        School.ID
## 1 7685f0265a19d7b52a470ee4bac883ba e180c7424cb9c68cb49f141b092a988f
## 2 f9f4af7099061fb4bf44642a03e5c331 08b20f1e2125103ed7aa17e8d76c71d4
## 3 afd99a01739ad5557b51b1ba0174e832 1287f5128b1f36bf8434e5705a7cc04d
## 4 c614a38bb1a5e68e2ae6ad9d94bb2492 900fec9cd7a3188acbc90586a09584ef
## 5 ec82a697fab916c0db0cdad746338df9 3b200e7fe3e6dde3c169c02e5fb5ae86
## 6 563958074d7b12b48b939279eb59e6ca b79a19772090efccde93b3a5934d829f
##                         Teacher.ID Teacher.Project.Posted.Sequence Project.Type
## 1 4ee5200e89d9e2998ec8baad8a3c5968                              25  Teacher-Led
## 2 cca2d1d277fb4adb50147b49cdc3b156                               3  Teacher-Led
## 3 6c5bd0d4f20547a001628aefd71de89e                               1  Teacher-Led
## 4 8ed6f8181d092a8f4c008b18d18e54ad                              40  Teacher-Led
## 5 893173d62775f8be7c30bf4220ad0c33                               2  Teacher-Led
## 6 5ef1793ff657860ca7856d475715ec2a                               4  Teacher-Led
##                                                       Project.Title
## 1                            Stand Up to Bullying: Together We Can!
## 2                                                Learning in Color!
## 3 Help Second Grade ESL Students Develop Language to Speak and Read
## 4          Help Bilingual Students Strengthen Reading Comprehension
## 5                                  Help Us Make Each Minute Count! 
## 6                                It's about Time...  Time for Kids!
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           Project.Essay
## 1 Did you know that 1-7 students in grades K-12 is either a bully or a victim of bullying? My goal is to raise prevention through being educated and aware of the effects and consequences of bullying. They need to know how to access a science-base, research-validated curriculum via Internet and books. <!--DONOTREMOVEESSAYDIVIDER-->We are part of a small pre-k and kindergarten primary center. Our student population is less than 300 students. My students are learning to stand up to bullying. They are great kids discovering the world and learning to read and write. But I worry that next year when they leave our primary center and attend their neighborhood school (student population over 500) they will face a very different environment. I hope that the lessons that they have learned about friendships, kindness, and working cooperatively will reduce any instances of bullying. I want to raise awareness and educate my students so they can ask for help when they needed. Vigilance and discipline is the key! <!--DONOTREMOVEESSAYDIVIDER-->We know that knowledge is power; I want my students to know how to access information that will help them stand up to bullying. My project is to work with them in group discussions using their book, "Bullying in Schools" as they will all have a copy to keep and share with their parents. I will use the tablet with a small group of four kids at a time. Together we will navigate web sites to watch videos and testimonials. They can then talk about it, share with their parents, and I can lead class discussions. \n\nThere is access to information, articles and poems, and making them aware will educate them further on bullying. We must rise up and be the change to reach bullying in the heart of the problem. Bullying takes many forms and can happen in many contexts. Bullying is complex and there is no one size fits all solution for it, but knowledge is power. My students need to know how to find answers and information when they need it. <!--DONOTREMOVEESSAYDIVIDER-->This problem of bullying in schools is not one without a solution. We need every one helping teachers, parents, administrators and people in our community. The facts show that it is estimated that 160,000 children miss school every day due to fear of attack or intimidation by other students. Being aware of these facts should raise awareness and support. Your efforts will certainly have an impact. Together we can stand up to bullying. 
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     Help us have a fun, interactive listening center in our class! Did you struggle to read when you were younger?  Did it help to see the words and hear them read to you at the same time?  This listening center will help the students in my class that need a little extra auditory and visual guidance.\n\n <!--DONOTREMOVEESSAYDIVIDER-->I teach 22 awesome second-grade students in Georgia. We are a Title I school with over 94% free or reduced lunch. The students come to school excited and ready to learn every day. My students love to read, but a large portion of them struggle to read on grade level. They are awesome at math, but need concrete examples and hands-on activities to help them retain the information. <!--DONOTREMOVEESSAYDIVIDER-->I am requesting a listening center, read along books on CD, and headphones for our computers.  Many students in my class need to hear, see, and interact with the reading material to gain meaning from it.  Our CD player broke a while back and the students miss listening to the books on CD.  The new 6 person listening center will allow the students to listen to the same book and then complete meaningful activities together about the story and its parts.  The new headphones for the computers will allow the students to play educational games and listen to stories without the noise from the rest of the class. interrupting them.  We are ready to listen up and learn!\n\n <!--DONOTREMOVEESSAYDIVIDER-->This project is important to the success of the students in my class. Having the the listening center in the class will allow the students the opportunity to listen to and interact with the reading materials with more confidence and ease.  The listening center will be a great addition to our center rotation time. 
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          Visiting or moving to a new place can be very exciting yet frightening at the same time especially if you are unfamiliar with the native language. Learning a second language is a difficult challenge to overcome when there is not enough resources or support. <!--DONOTREMOVEESSAYDIVIDER-->Let me introduce you to my 28 second grade shining stars. The class ranges from beginner ESL students to advance ESL students located in an urban neighborhood in New York City. These students come from all over the world and are eager to learn and tackle the challenges of the school day.   <!--DONOTREMOVEESSAYDIVIDER-->By the time the students have reached the second grade they are already used to a normal school day routine. In second grade students are enriching their language, social, math and reading skills. However, the students that have just arrived to the country are shy, not used to NYC public schools and cannot communicate due to lack of language. The students must learn the language first before they can master fluency and reading comprehension. With the audio cards the students will be able to see pictures, read, and hear vocabulary words to enhance their language skills and learn how to use them in sentences. The students will develop fluency, recognize letters, sounds and sight words.  <!--DONOTREMOVEESSAYDIVIDER-->Teaching students English is very important. I want to provide the students with the tools they need to understand the English language and provide a structured environment for continuous growth. Once the students have acquired the language they'll feel more confident in school and can move on to develop the skills needed to give them a successful life. It is vital that they have as many resources available to help them become confident English speakers and develop an everlasting love of learning. 
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        Students at our school are still working hard to become proficient in English. A great series like The Sisters Grimm will help students improve reading comprehension by providing elements that support struggling readers: high-interest, familiar characters, humor, and suspense. <!--DONOTREMOVEESSAYDIVIDER-->Students at our school face many challenges on a daily basis. Many students at our school are working hard to learn English. Students deal with issues of poverty as well. However, the students at our school are very resilient and determined. They work very hard and are making gains, but they still need support from donors like you. <!--DONOTREMOVEESSAYDIVIDER-->Students at our school often have few books to read at home, and studies show that reading at home is an important factor in student's growth as a reader. The books you provide will be added directly to the students' classroom library, where they will be able to check out books for in-class and at-home reading. Additionally, the books in this great series, The Sisters Grimm, are ideal for students who still find reading a bit daunting, because the students become familiar with the characters and format of the books. <!--DONOTREMOVEESSAYDIVIDER-->Thank you very much for considering a donation to these wonderful children. They truly strive to succeed each day, and almost all have dreams of going to college one day. Your kindness in the form of a donation would make an impact in the lives of these deserving young people. 
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     "Idle hands" were something that Issac Watts spoke of, but when students with special needs are left with idle hands, it can result in mischievous actions. Help us keep our hands busy with work!   <!--DONOTREMOVEESSAYDIVIDER-->My students have short attention spans for most kind of things, but things that they can get their hands into and work on hold their attention for longer times than I always can. We are busy bodies, and we need things to help keep our hands busy during the day to stay out of trouble and learn how to work independently on things.  <!--DONOTREMOVEESSAYDIVIDER-->My students need items such as Velcro, two pounds of multicolored Theraputty  and a dozen Gator Grabber Tweezers, to work independently and to make different work stations.  Our school may not always have the money to get things to keep us hands-on all of the time, so help us stay busy and focused by helping us get these items! Our lives are enhanced by having the ability to learn how to complete tasks independently and by learning hands on pre-vocational skills. The light covers will also help us to be able to sit at our desks and tolerate the lights that sometimes can be hard for our overloaded sensory systems to handle.  <!--DONOTREMOVEESSAYDIVIDER-->Help us to learn the skills we need to keep our hands and our minds busy! We thrive from being able to do hands-on projects and tasks. Our teacher wants to be able to provide us with the activities and tasks to help us to be successful!  
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         We know that success in school is directly related to a student's ability to read and comprehend non-fiction text. As students move into the middle and high school years, more of their education is tied to textbooks and non-fiction reading. I want to get my students comfortable with this genre. <!--DONOTREMOVEESSAYDIVIDER-->Nearly 80% of the students in my school are eligible for free or reduced priced meals. Many of these children just don't have access to the variety of literacy materials that students in more affluent areas may take for granted. My students are hungry for knowledge and I want to feed that hunger with more non-fiction text.  <!--DONOTREMOVEESSAYDIVIDER-->TIME For Kids is a weekly classroom news magazine that motivates kids to read. Issues cover a wide range of real-world topics and themes kids love to learn about and helps students become informed their world. \n\nHaving access to this type of high interest, non-fiction text will provide my students with the opportunity to sharpen their reading and comprehension abilities. <!--DONOTREMOVEESSAYDIVIDER-->The achievement gap between children from lower socio-economic backgrounds and those from more affluent backgrounds is getting larger all the time. This project will help me to give my students the chance they need to be better readers! 
##                                                                                                                                                                               Project.Short.Description
## 1        Did you know that 1-7 students in grades K-12 is either a bully or a victim of bullying? My goal is to raise prevention through being educated and aware of the effects and consequences of...
## 2       Help us have a fun, interactive listening center in our class! Did you struggle to read when you were younger? Did it help to see the words and hear them read to you at the same time? This...
## 3       Visiting or moving to a new place can be very exciting yet frightening at the same time especially if you are unfamiliar with the native language. Learning a second language is a difficult...
## 4   Students at our school are still working hard to become proficient in English. A great series like The Sisters Grimm will help students improve reading comprehension by providing elements that...
## 5 "Idle hands" were something that Issac Watts spoke of, but when students with special needs are left with idle hands, it can result in mischievous actions. Help us keep our hands busy with work!...
## 6 We know that success in school is directly related to a student's ability to read and comprehend non-fiction text. As students move into the middle and high school years, more of their education...
##                                                                                                                                                                                                                                                         Project.Need.Statement
## 1 My students need 25 copies of "Bullying in Schools" for each to keep, "Stand Up For Yourself and Your Friends: Dealing With Bullies", "A Smart Kid's Guide to Online Bullying" and a tablet to research specific websites that talk about prevention strategies on bullying.
## 2                                                                                                                                                                               My students need a listening center, read along books on CD, and headphones for the computers.
## 3                                                                                                                    My students need beginning vocabulary audio cards and a CD player so they can be able to see and hear words in the correct use for language development. 
## 4                                                                                                                                                                               My students need one copy of each book in The Sisters Grimm series to support literacy growth.
## 5                                                                                                My students need items such as Velcro, two pounds of multicolored Theraputty  and a dozen Gator Grabber Tweezers, to work independently and to make different work stations. 
## 6                                                                                                                                                                                                                          My students need 24 subscriptions to Time for Kids!
##           Project.Subject.Category.Tree       Project.Subject.Subcategory.Tree
## 1                      Applied Learning Character Education, Early Development
## 2 Applied Learning, Literacy & Language            Early Development, Literacy
## 3                   Literacy & Language                                    ESL
## 4                   Literacy & Language                          ESL, Literacy
## 5                         Special Needs                          Special Needs
## 6 Literacy & Language, History & Civics              Literacy, Social Sciences
##   Project.Grade.Level.Category Project.Resource.Category Project.Cost
## 1                Grades PreK-2                Technology       361.80
## 2                Grades PreK-2                Technology       512.85
## 3                Grades PreK-2                  Supplies       435.92
## 4                   Grades 3-5                     Books       161.26
## 5                   Grades 3-5                  Supplies       264.19
## 6                   Grades 3-5                     Other       175.15
##   Project.Posted.Date Project.Expiration.Date Project.Current.Status
## 1          2013-01-01              2013-05-30           Fully Funded
## 2          2013-01-01              2013-05-31                Expired
## 3          2013-01-01              2013-05-30           Fully Funded
## 4          2013-01-01              2013-05-31           Fully Funded
## 5          2013-01-01              2013-05-30           Fully Funded
## 6          2013-01-01              2013-05-31           Fully Funded
##   Project.Fully.Funded.Date
## 1                2013-01-11
## 2                          
## 3                2013-05-22
## 4                2013-02-06
## 5                2013-01-01
## 6                2013-02-01
head(df_resources)
##                         Project.ID                        Resource.Item.Name
## 1 000009891526c0ade7180f8423792063                 chair move and store cart
## 2 00000ce845c00cbf0686c992fc369df4           sony mdr zx100 blk   headphones
## 3 00002d44003ed46b066607c5455a999a gaiam kids stay-n-play balance ball, grey
## 4 00002d44003ed46b066607c5455a999a   cf520x - giant comfy pillows - set of 4
## 5 00002d44003ed46b066607c5455a999a             serta lounger, mini, sky blue
## 6 00002d44003ed46b066607c5455a999a   big joe roma bean bag chair, spicy lime
##   Resource.Quantity Resource.Unit.Price         Resource.Vendor.Name
## 1                 1              350.00                             
## 2                40               12.86                        CDW-G
## 3                 4               19.00              Amazon Business
## 4                 1              269.00 Lakeshore Learning Materials
## 5                 1              131.85              Amazon Business
## 6                 2               33.88              Amazon Business
head(df_schools)
##                          School.ID                            School.Name
## 1 00003e0fdd601b8ea0a6eb44057b9c5e             Capon Bridge Middle School
## 2 00004e32a448b4832e1b993500bf0731 The Woodlands College Park High School
## 3 0002021bb799f28de224f1acc1ff08c4       Samantha Smith Elementary School
## 4 0004604f675212a8cac1161338265196           Kingsbury Country Day School
## 5 0004c9d50bcf0cea990f844e58b5e2c3             Redwater Elementary School
## 6 0004ffe3558fd70d939ad522b92447c8         Math & Science Success Academy
##   School.Metro.Type School.Percentage.Free.Lunch  School.State School.Zip
## 1             rural                           56 West Virginia      26711
## 2             urban                           41         Texas      77384
## 3          suburban                            2    Washington      98074
## 4           unknown                           76      Michigan      48370
## 5             rural                           50         Texas      75573
## 6           unknown                           63       Arizona      85706
##     School.City School.County              School.District
## 1  Capon Bridge     Hampshire Hampshire Co School District
## 2 The Woodlands    Montgomery   Conroe Ind School District
## 3     Sammamish          King Lake Washington Sch Dist 414
## 4        Oxford       Oakland   Michigan Dept Of Education
## 5      Redwater         Bowie    Redwater Ind Sch District
## 6        Tucson          Pima    Arizona Dept Of Education
head(df_teachers)
##                         Teacher.ID Teacher.Prefix
## 1 00000f7264c27ba6fea0c837ed6aa0aa           Mrs.
## 2 00002d44003ed46b066607c5455a999a           Mrs.
## 3 00006084c3d92d904a22e0a70f5c119a            Mr.
## 4 0000a9af8b6b9cc9e41f53322a8b8cf1            Ms.
## 5 0000d4777d14b33a1406dd6c9019fe89            Ms.
## 6 0000fc11407901bcacdfad1db909b9f6           Mrs.
##   Teacher.First.Project.Posted.Date
## 1                        2013-08-21
## 2                        2016-10-23
## 3                        2016-09-08
## 4                        2015-10-25
## 5                        2017-02-10
## 6                        2015-06-22

Review 2


Exploratory Data Analysis


Donations dataset

The file contains 4687884 records with 7 variables. They are Project ID, Donation ID, Donor ID, Donation Included Optional Donation, Donation Amount, Donor Cart Sequence, Donation Received Date.

Project ID: Unique identifier of a donor.
Donation ID: Unique ID of a donation.
Donor ID: The donor’s state.
Donation Included Optional Donation: Yes/No to give 15% of donation amount to Donoschoose.org.
Donation Amount: Total amount donated for a project.
Donor Cart Sequence: Project position on list of desired donations within Cart list.
Donation Received Date: Date and time on which the donation was received.

#Checking for null values
sum(is.na(df_donations))
## [1] 0

No null values present in this dataset.

summary(df_donations)
##   Project.ID        Donation.ID          Donor.ID        
##  Length:4687884     Length:4687884     Length:4687884    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##  Donation.Included.Optional.Donation Donation.Amount    Donor.Cart.Sequence
##  Length:4687884                      Min.   :    0.01   Min.   :    1.0    
##  Class :character                    1st Qu.:   14.82   1st Qu.:    1.0    
##  Mode  :character                    Median :   25.00   Median :    2.0    
##                                      Mean   :   60.67   Mean   :  143.1    
##                                      3rd Qu.:   50.00   3rd Qu.:   12.0    
##                                      Max.   :60000.00   Max.   :18116.0    
##  Donation.Received.Date
##  Length:4687884        
##  Class :character      
##  Mode  :character      
##                        
##                        
## 
#Finding number of unique records for all variables
df_donations %>% summarise_all(funs(n_distinct))
## Warning: `funs()` is deprecated as of dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
##   Project.ID Donation.ID Donor.ID Donation.Included.Optional.Donation
## 1     901965     4687844  2024554                                   2
##   Donation.Amount Donor.Cart.Sequence Donation.Received.Date
## 1           76192               18071                4066370

We observe that there are 4687844 unique donations, 900K projects and 2M donors.

#Distribution of Donation Amount
ggplot(df_donations, aes(x=Donation.Amount))+
  geom_boxplot(fill="#40B0A6")+coord_flip()

data.frame(min_amount=min(df_donations$Donation.Amount),
           max_amount=max(df_donations$Donation.Amount),
           mean_amount=mean(df_donations$Donation.Amount),
           median_amount=median(df_donations$Donation.Amount)
           )
##   min_amount max_amount mean_amount median_amount
## 1       0.01      60000    60.66879            25

From the above boxplot and table, the median and mean values, we can see that most of the donations lie in the range of 25-100 USD roughly. Due to the presence of extreme outliers the box plot is flattened. Therefore to visualize the boxplot distribution, we can neglect outliers and filter those donations below 100 USD

donations = df_donations%>%filter(Donation.Amount<100)
ggplot(donations, aes(x=Donation.Amount))+
  geom_boxplot(fill="#40B0A6")+coord_flip()

donations %>% 
  ggplot(aes(Donation.Amount))+
  geom_density(alpha=0.3)+ geom_vline(aes(xintercept=median(Donation.Amount)),
             color="red", linetype="dashed", size=1)+
  labs(col = "Optional Donation",
       x = "Donation Amount",
       y = "Density")

ggplot(df_donations,aes(x=Donation.Included.Optional.Donation, fill=Donation.Included.Optional.Donation))+ geom_bar()+theme_bw()

Most of the donors have opted for Optional Donations.

#One time donors
df_donations%>%filter(Donor.Cart.Sequence<=30)%>%
group_by(Donor.Cart.Sequence)%>%summarize(count=n())%>%
  mutate(Donor.Cart.Sequence=as.factor(Donor.Cart.Sequence))%>%
  ggplot(aes(x=Donor.Cart.Sequence, y=count, group=1))+geom_line(stat="identity", color="blue") + 
  geom_point(stat="identity", color="red")+ labs(x="The Donor cart sequence",
                                                 y="no of Donors", 
                                                 title="The difference in the no. of one time donors and the rest")
## `summarise()` ungrouping output (override with `.groups` argument)

From the above plotm we can see that most of them are one-time donors. Let’s find out the percentage of one-time donors.

print(paste("the Percentage of one time Donors is ",((df_donations%>%filter(Donor.Cart.Sequence==1) %>%nrow)-(df_donations%>%filter(Donor.Cart.Sequence==2) %>%nrow)) /(df_donations%>%filter(Donor.Cart.Sequence==1) %>%nrow)*100,"%"))
## [1] "the Percentage of one time Donors is  69.7128941025263 %"
#Feature Engineering the Received Date
df_donations$Donation.year=year(df_donations$Donation.Received.Date)
df_donations$Donation.month=month(df_donations$Donation.Received.Date)
#Trend of donation amount over the years
df_donations %>% group_by(Donation.year, Donation.month) %>% summarise(Yearly.donation = sum(Donation.Amount))%>%
ggplot(aes(Donation.year, Yearly.donation, size = Yearly.donation,colour = as.factor(Donation.month), frame = Donation.year)) +
  geom_point() + labs(x="Years",
                      y="Total Donation Amount", 
                      title="Year-month wise total donation amount")+
  theme_bw()
## `summarise()` regrouping output by 'Donation.year' (override with `.groups` argument)

#transition_time(Donation.year)
#gif<-animate(g, width=800, height=400, renderer=gifski_renderer(loop=FALSE))  
#save_animation(gif,"output.gif")
#display_html('<iframe src="output.gif" width=70% height=500></iframe>')

We see that the donations received are lowest in the month of June, while maximum donations are being made in the month of December.

#Trend of donation amount over the months
df_donations %>% group_by(Donation.month) %>% summarise(Monthly.donation = sum(Donation.Amount))%>%
ggplot(aes(as.factor(Donation.month), Monthly.donation, colour = as.factor(Donation.month), group=1)) + geom_line(color="purple")+
  geom_point() +labs(x="Months",
                      y="Total Donation Amount", 
                      title="Month wise total donation amount")+
  theme_bw()
## `summarise()` ungrouping output (override with `.groups` argument)

We can notice a dip in donations around the months of April, May, and June followed by a steep rise. This could be due to return from summer holidays.

Donors Dataset

This file contains 2122640 observations with 6 variables. They are,
Donor ID: Unique identifier of a donor.
Donor City: The donor’s city.
Donor State: The donor’s state.
Donor Is Teacher: Whether or not the donor is also a teacher with a DonorsChoose.org teacher account.
Donor Zip: The donor’s zip code (only first 3 digits).

glimpse(df_donors)
## Rows: 2,122,640
## Columns: 5
## $ Donor.ID         <chr> "00000ce845c00cbf0686c992fc369df4", "00002783bc5d1...
## $ Donor.City       <chr> "Evanston", "Appomattox", "Winton", "Indianapolis"...
## $ Donor.State      <chr> "Illinois", "other", "California", "Indiana", "New...
## $ Donor.Is.Teacher <chr> "No", "No", "Yes", "No", "No", "No", "No", "No", "...
## $ Donor.Zip        <chr> "602", "245", "953", "462", "075", "", "069", "543...
df_donors %>% summarise_all(funs(n_distinct))
##   Donor.ID Donor.City Donor.State Donor.Is.Teacher Donor.Zip
## 1  2122640      15205          52                2      1066
#Replacing "" in city with NA values
df_donors = df_donors %>% mutate(Donor.City = replace(Donor.City,Donor.City=="",NA))
#Top Cities in terms of donations, ignoring NA vakues
df_donors %>% group_by(Donor.City) %>% na.omit()%>% summarise(count = length(Donor.City)) %>% top_n(10, wt = count)%>%
ggplot(aes(x = reorder(Donor.City, count), y = count, fill = count)) + 
   geom_bar(stat = 'identity') +  scale_fill_viridis(direction = 1)+coord_flip()+
    theme_bw()+ xlab("City") +ggtitle("Top 10 cities w.r.t total number of donors")
## `summarise()` ungrouping output (override with `.groups` argument)

The top cities from where the donors belong are Chicago, New York, Brooklyn, etc.

#Donors map, distribution across the states
#Grouping based on total number of donors
state = df_donors %>% group_by(Donor.State) %>% summarise(number_of_donors = length(Donor.ID))
## `summarise()` ungrouping output (override with `.groups` argument)
state$Donor.State = tolower(state$Donor.State)

#Assigning name of states in USA
master <- c("CALIFORNIA", "ARIZONA", "ARKANSAS","Alabama", "Alaska", "colorado", "CONNECTICUT","Delaware", "DISTRICT OF COLUMBIA", 
  "FLORIDA", "Georgia", "hawaii","Idaho", "Illinois", "Indiana", "Iowa", "kansas", "Kentucky", "LOUISIANA", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota","Mississippi", "Missouri", 
  "MONTANA", "NEBRASKA", "Nevada", "New Hampshire", "New Jersey", "NEW MEXICO", "New York", 
  "North Carolina","North Dakota", "OHIO", "OKLAHOMA", "Oregon", "PANAMA", "Pennsylvania", "Rhode Island",
  "SOUTH CAROLINA", "SOUTH DAKOTA", "Tennessee", "Texas", "UTAH", "Vermont",
  "Virgin Islands", "VIRGINIA", "Washington", "West Virginia", "WISCONSIN", "Wyoming")

master_low = tolower(master)

#State table with state names and abbreviations
state_tbl = setNames(state.abb, tolower(state.name))
state_tbl = data.frame(
  orig=master,
  lower=master_low,
  abbrev=state_tbl[master_low],
  stringsAsFactors=FALSE
)

#Merging state table and state variable by taking common values
merge = merge(state, state_tbl, by.x = "Donor.State", by.y = "lower")
plot_ly(type="choropleth",locations = merge$abbrev, locationmode="USA-states", z=merge$number_of_donors) %>% layout(geo=list(scope="usa"))
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

We can observe that California is the highest in terms of total number of donors.

Projects Dataset

This file contains 34919 observations of 18 variables. They are,

Project ID: Unique identifier of a project.
School ID: Unique identifier of a school where the project is proposed from.
Teacher ID: Unique identifier of a teacher who proposed the project.
Teacher Project Posted Sequence: Represents the order as a project issued by the teacher.
Project Type: Type of the project.
Project Title: Title of the project.
Project Essay: Essay of the project.
Project Short Description: Description of the project.
Project Need Statement: Statement for the resources that the project needs.
Project Subject Category Tree: Category of the project.
Project Subject Subcategory Tree: Subcategory of the project.
Project Grade Level Category: Grade level that the project aims.
Project Resource Category: Category of resources that the project needs.
Project Cost: Costs of the project.
Project Posted Date: Date when the project is posted.
Project Expiration Date: Date when the project is expired.
Project Current Status: Current status of the project.
Project Fully Funded Date: Date when the project gets fully funded.

glimpse(df_projects)
## Rows: 34,919
## Columns: 18
## $ Project.ID                       <chr> "7685f0265a19d7b52a470ee4bac883ba"...
## $ School.ID                        <chr> "e180c7424cb9c68cb49f141b092a988f"...
## $ Teacher.ID                       <chr> "4ee5200e89d9e2998ec8baad8a3c5968"...
## $ Teacher.Project.Posted.Sequence  <int> 25, 3, 1, 40, 2, 4, 3, 57, 14, 1, ...
## $ Project.Type                     <chr> "Teacher-Led", "Teacher-Led", "Tea...
## $ Project.Title                    <chr> "Stand Up to Bullying: Together We...
## $ Project.Essay                    <chr> "Did you know that 1-7 students in...
## $ Project.Short.Description        <chr> "Did you know that 1-7 students in...
## $ Project.Need.Statement           <chr> "My students need 25 copies of \"B...
## $ Project.Subject.Category.Tree    <chr> "Applied Learning", "Applied Learn...
## $ Project.Subject.Subcategory.Tree <chr> "Character Education, Early Develo...
## $ Project.Grade.Level.Category     <chr> "Grades PreK-2", "Grades PreK-2", ...
## $ Project.Resource.Category        <chr> "Technology", "Technology", "Suppl...
## $ Project.Cost                     <dbl> 361.80, 512.85, 435.92, 161.26, 26...
## $ Project.Posted.Date              <chr> "2013-01-01", "2013-01-01", "2013-...
## $ Project.Expiration.Date          <chr> "2013-05-30", "2013-05-31", "2013-...
## $ Project.Current.Status           <chr> "Fully Funded", "Expired", "Fully ...
## $ Project.Fully.Funded.Date        <chr> "2013-01-11", "", "2013-05-22", "2...
df_projects %>% summarise_all(funs(n_distinct))
##   Project.ID School.ID Teacher.ID Teacher.Project.Posted.Sequence Project.Type
## 1      34919     14373      24351                             216            1
##   Project.Title Project.Essay Project.Short.Description Project.Need.Statement
## 1         33018         34862                     33598                  34394
##   Project.Subject.Category.Tree Project.Subject.Subcategory.Tree
## 1                            44                              335
##   Project.Grade.Level.Category Project.Resource.Category Project.Cost
## 1                            5                         7        21929
##   Project.Posted.Date Project.Expiration.Date Project.Current.Status
## 1                 149                     256                      3
##   Project.Fully.Funded.Date
## 1                       270
#Feature Engineering
df_projects$Project.Type = as.factor(df_projects$Project.Type)
df_projects$Project.Subject.Category.Tree = as.factor(df_projects$Project.Subject.Category.Tree)
df_projects$Project.Subject.Subcategory.Tree = as.factor(df_projects$Project.Subject.Subcategory.Tree)
df_projects$Project.Resource.Category = as.factor(df_projects$Project.Resource.Category)
df_projects$Project.Current.Status = as.factor(df_projects$Project.Current.Status)
df_projects$Project.Grade.Level.Category = as.factor(df_projects$Project.Grade.Level.Category)
#Distribution of type of projects
levels(df_projects$Project.Type)
## [1] "Teacher-Led"

All of the projects are Teacher-led

#Distribution of project categories
#To extract only main categories
df_projects$cat <- gsub(",.*$", "", df_projects$Project.Subject.Category.Tree)
df_projects$cat = as.factor(df_projects$cat)
levels(df_projects$cat)
## [1] ""                    "Applied Learning"    "Health & Sports"    
## [4] "History & Civics"    "Literacy & Language" "Math & Science"     
## [7] "Music & The Arts"    "Special Needs"
#Replacing "" in cat with NA values
df_projects = df_projects %>% mutate(cat = replace(cat,cat=="",NA))

df_projects %>% group_by(cat) %>% summarise(count = length(Project.ID)) %>% na.omit()%>%top_n(10, wt = count)%>%
ggplot(aes(x = reorder(cat, count), y = count, fill = count)) + 
   geom_bar(stat = 'identity') +scale_fill_viridis(direction = -1)+ coord_flip()+
    theme_bw()+ theme(legend.position = 'none')+xlab("Category") + ylab("Top 10 Categories")
## `summarise()` ungrouping output (override with `.groups` argument)

Literacy & Language, Math & Science and Applied learning are the top 3 categories of projects

#Distribution of project sub categories
#To extract first words of subcategories
df_projects$subcat <- gsub(",.*$", "", df_projects$Project.Subject.Subcategory.Tree)
df_projects$subcat = as.factor(df_projects$subcat)
levels(df_projects$subcat)
##  [1] ""                      "Applied Sciences"      "Character Education"  
##  [4] "Civics & Government"   "College & Career Prep" "Community Service"    
##  [7] "Early Development"     "Economics"             "Environmental Science"
## [10] "ESL"                   "Extracurricular"       "Foreign Languages"    
## [13] "Gym & Fitness"         "Health & Life Science" "Health & Wellness"    
## [16] "History & Geography"   "Literacy"              "Literature & Writing" 
## [19] "Mathematics"           "Music"                 "Nutrition Education"  
## [22] "Other"                 "Parent Involvement"    "Performing Arts"      
## [25] "Social Sciences"       "Special Needs"         "Team Sports"          
## [28] "Visual Arts"
#Replacing "" in subcat with NA values
df_projects = df_projects %>% mutate(subcat = replace(subcat,subcat=="",NA))

df_projects %>% group_by(subcat) %>% summarise(count = length(Project.ID)) %>% na.omit()%>%top_n(10, wt = count)%>%
ggplot(aes(x = reorder(subcat, count), y = count, fill = count)) + 
   geom_bar(stat = 'identity') +scale_fill_viridis(direction = -1)+ coord_flip()+
    theme_bw()+ theme(legend.position = 'none')+xlab("Sub-Category") + ylab("Top 10 Sub-Categories")
## `summarise()` ungrouping output (override with `.groups` argument)

Literacy, Literature & Writing and Mathematics are the top 3 subcategories of project.

#Project funding status
df_projects = df_projects %>% filter(Project.Current.Status!="")
levels(df_projects$Project.Current.Status)
## [1] ""             "Expired"      "Fully Funded"
ggplot(df_projects, aes(x = as.factor(Project.Current.Status), fill = as.factor(Project.Current.Status))) + geom_bar()+theme_bw()+theme(legend.position = 'none') + xlab('Current Status')

print(paste("Percentage of Fully Funded Projects: ",((df_projects%>%filter(Project.Current.Status=="Fully Funded") %>%nrow)-(df_projects%>%filter(Project.Current.Status=="Expired") %>%nrow)) /(df_projects%>%filter(Project.Current.Status=="Fully Funded") %>%nrow)*100,"%"))
## [1] "Percentage of Fully Funded Projects:  61.4969656102495 %"
#Grade levels
levels(df_projects$Project.Grade.Level.Category)
## [1] ""              "Grades 3-5"    "Grades 6-8"    "Grades 9-12"  
## [5] "Grades PreK-2"
ggplot(df_projects, aes(as.factor(Project.Grade.Level.Category), fill = as.factor(Project.Grade.Level.Category))) + geom_bar()+theme_bw()+xlab("Grades")+theme(legend.position = 'none')

Most of the projects belong to PreK-2 grade level, followed by grades 3-5 and grades 6-8

#Resources category
tab <- within(df_projects,  Project.Resource.Category <- factor(Project.Resource.Category, levels=names(sort(table(Project.Resource.Category), decreasing=TRUE))))
levels(df_projects$Project.Resource.Category)
## [1] ""           "Books"      "Other"      "Supplies"   "Technology"
## [6] "Trips"      "Visitors"
ggplot(tab, aes(Project.Resource.Category, fill = Project.Resource.Category)) + geom_bar()+theme_bw()+theme(axis.text.x = element_text(angle = 45, hjust = 1))+
theme(legend.position = 'none')

Most of the resources required for the projects are from technology, supplies and books

Resources dataset

The file contains 7210448 observations of 5 variables. They are:
Project ID: Unique identifier of a project.
Resource Item Name: The name of the requested item, as it appears on the vendor’s website.
Resource Quantity: The quantity of the requested item.
Resource Unit Price: The price per unit of the requested item.
Resource Vendor Name: Name of the vendor.

df_resources %>% summarise_all(funs(n_distinct))
##   Project.ID Resource.Item.Name Resource.Quantity Resource.Unit.Price
## 1    1208649            1073887               342               50254
##   Resource.Vendor.Name
## 1                   32
head(df_resources)
##                         Project.ID                        Resource.Item.Name
## 1 000009891526c0ade7180f8423792063                 chair move and store cart
## 2 00000ce845c00cbf0686c992fc369df4           sony mdr zx100 blk   headphones
## 3 00002d44003ed46b066607c5455a999a gaiam kids stay-n-play balance ball, grey
## 4 00002d44003ed46b066607c5455a999a   cf520x - giant comfy pillows - set of 4
## 5 00002d44003ed46b066607c5455a999a             serta lounger, mini, sky blue
## 6 00002d44003ed46b066607c5455a999a   big joe roma bean bag chair, spicy lime
##   Resource.Quantity Resource.Unit.Price         Resource.Vendor.Name
## 1                 1              350.00                             
## 2                40               12.86                        CDW-G
## 3                 4               19.00              Amazon Business
## 4                 1              269.00 Lakeshore Learning Materials
## 5                 1              131.85              Amazon Business
## 6                 2               33.88              Amazon Business
glimpse(df_resources)
## Rows: 7,210,448
## Columns: 5
## $ Project.ID           <chr> "000009891526c0ade7180f8423792063", "00000ce84...
## $ Resource.Item.Name   <chr> "chair move and store cart", "sony mdr zx100 b...
## $ Resource.Quantity    <dbl> 1, 40, 4, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ Resource.Unit.Price  <dbl> 350.00, 12.86, 19.00, 269.00, 131.85, 33.88, 1...
## $ Resource.Vendor.Name <chr> "", "CDW-G", "Amazon Business", "Lakeshore Lea...
#Feature engineering
df_resources$Resource.Vendor.Name=as.factor(df_resources$Resource.Vendor.Name)
levels(df_resources$Resource.Vendor.Name)
##  [1] ""                                   "ABC School Supply"                 
##  [3] "Abilitations"                       "AKJ Education"                     
##  [5] "Amazon Business"                    "Best Buy Education"                
##  [7] "Blick Art Materials"                "Britannica Digital Learning"       
##  [9] "BSN"                                "Cannon Sports"                     
## [11] "Carolina Biological Supply Company" "CDW-G"                             
## [13] "Childcraft"                         "DICK'S Sporting Goods"             
## [15] "Frey Scientific"                    "Grainger"                          
## [17] "Kaplan Early Learning Company"      "Lakeshore Learning Materials"      
## [19] "LEGO Education"                     "MakerBot"                          
## [21] "Nasco"                              "Quill.com"                         
## [23] "Sargent-Welch"                      "Sax Arts & Crafts"                 
## [25] "Scholastic Classroom Magazines"     "School Specialty"                  
## [27] "Sportime"                           "Staples Advantage"                 
## [29] "Teachers' School Supply"            "TIME for Kids"                     
## [31] "Ward's Science"                     "Woodwind and Brasswind"
#Replacing "" with NA
df_resources = df_resources %>% mutate(Resource.Vendor.Name = replace(Resource.Vendor.Name, Resource.Vendor.Name=="", NA))
head(df_resources)
##                         Project.ID                        Resource.Item.Name
## 1 000009891526c0ade7180f8423792063                 chair move and store cart
## 2 00000ce845c00cbf0686c992fc369df4           sony mdr zx100 blk   headphones
## 3 00002d44003ed46b066607c5455a999a gaiam kids stay-n-play balance ball, grey
## 4 00002d44003ed46b066607c5455a999a   cf520x - giant comfy pillows - set of 4
## 5 00002d44003ed46b066607c5455a999a             serta lounger, mini, sky blue
## 6 00002d44003ed46b066607c5455a999a   big joe roma bean bag chair, spicy lime
##   Resource.Quantity Resource.Unit.Price         Resource.Vendor.Name
## 1                 1              350.00                         <NA>
## 2                40               12.86                        CDW-G
## 3                 4               19.00              Amazon Business
## 4                 1              269.00 Lakeshore Learning Materials
## 5                 1              131.85              Amazon Business
## 6                 2               33.88              Amazon Business
#Resource Items word cloud
items <- df_resources %>% sample_n(10000)%>% unnest_tokens(word, Resource.Item.Name)
items <- items %>% anti_join(stop_words, by = "word")
items <- items %>% filter(!str_detect(word, "^\\d"))

items %>% count(word) %>% with(wordcloud(word, n, max=200,rot.per=0.45, colors=brewer.pal(8, "Paired")))
## Warning in wordcloud(word, n, max = 200, rot.per = 0.45, colors =
## brewer.pal(8, : set could not be fit on page. It will not be plotted.

We can see that “set”, “pack”, “book”, “black” are the words appearing prominently in the resources list. We can also see that “assorted”, “apple”, “kit”, “white”, “level”, “colors”, etc appear relatively more frequently than the rest.

Teachers Dataset

The file contains 402900 observations with 3 variables. They are:
Teacher ID: Unique identifier of a teacher.
Teacher Prefix: “Mrs.”, “Ms.”, “Mr.”, “Teacher” (gender neutral option) chosen by teacher during account creation.
Teacher First Project Posted Date: Date on which the teacher’s first project was posted.

df_teachers %>% summarise_all(funs(n_distinct))
##   Teacher.ID Teacher.Prefix Teacher.First.Project.Posted.Date
## 1     402900              7                              4699
glimpse(df_teachers)
## Rows: 402,900
## Columns: 3
## $ Teacher.ID                        <chr> "00000f7264c27ba6fea0c837ed6aa0aa...
## $ Teacher.Prefix                    <chr> "Mrs.", "Mrs.", "Mr.", "Ms.", "Ms...
## $ Teacher.First.Project.Posted.Date <chr> "2013-08-21", "2016-10-23", "2016...
#feature engineering
df_teachers$Teacher.Prefix=as.factor(df_teachers$Teacher.Prefix)
levels(df_teachers$Teacher.Prefix)
## [1] ""        "Dr."     "Mr."     "Mrs."    "Ms."     "Mx."     "Teacher"
#Distribution of teachers based on their prefix
df_teachers$Teacher.Prefix=as.character(df_teachers$Teacher.Prefix)
df_teachers = df_teachers %>% mutate(Teacher.Prefix = replace(Teacher.Prefix, Teacher.Prefix=="", "Other"))
df_teachers$Teacher.Prefix=as.factor(df_teachers$Teacher.Prefix)

tab1 <- within(df_teachers,Teacher.Prefix <- factor(Teacher.Prefix, levels=names(sort(table(Teacher.Prefix), decreasing=TRUE))))
ggplot(tab1, aes(x = Teacher.Prefix, fill = Teacher.Prefix)) + geom_bar()+ theme_bw()

Most of the teachers are referred by “Mrs.”

Schools Dataset

The file consists of 72993 records of 9 variables. They are:
School ID: Unique identifier of a school.
School Name: Name of the school.
School Metro Type: One of four categories describing metro type, or urbanicity, of school area.
School Percentage Free Lunch: Integer describing percentage of students qualifying for free or reduced lunch, obtained from NCES data. For schools without NCES data, a district average is used.
School State: The state of the school that the teacher was teaching at at the time the project was posted.
School Zip: The zip code of the school that the teacher was teaching at at the time the project was posted.
School City: The city of the school that the teacher was teaching at at the time the project was posted.
School County: The county of the school that the teacher was teaching at at the time the project was posted.
School District: The district of the school that the teacher was teaching at at the time the project was posted.

glimpse(df_schools)
## Rows: 72,993
## Columns: 9
## $ School.ID                    <chr> "00003e0fdd601b8ea0a6eb44057b9c5e", "0...
## $ School.Name                  <chr> "Capon Bridge Middle School", "The Woo...
## $ School.Metro.Type            <chr> "rural", "urban", "suburban", "unknown...
## $ School.Percentage.Free.Lunch <int> 56, 41, 2, 76, 50, 63, 17, 15, 46, 29,...
## $ School.State                 <chr> "West Virginia", "Texas", "Washington"...
## $ School.Zip                   <int> 26711, 77384, 98074, 48370, 75573, 857...
## $ School.City                  <chr> "Capon Bridge", "The Woodlands", "Samm...
## $ School.County                <chr> "Hampshire", "Montgomery", "King", "Oa...
## $ School.District              <chr> "Hampshire Co School District", "Conro...
df_schools %>% summarise_all(funs(n_distinct))
##   School.ID School.Name School.Metro.Type School.Percentage.Free.Lunch
## 1     72993       56831                 5                          102
##   School.State School.Zip School.City School.County School.District
## 1           51      19008       10401          1783           10852
ggplot(df_schools, aes(x=School.Percentage.Free.Lunch))+geom_histogram(bins = 15, color="blue",size = 0.25, fill="lightgreen") + geom_vline(xintercept = mean(df_schools$School.Percentage.Free.Lunch, na.rm= "T"), linetype="dotted", 
                color = "red", size=.5) + xlab("Free lunch Percent")+
            ggtitle("Percent of students eligible for free lunch in schools")
## Warning: Removed 1141 rows containing non-finite values (stat_bin).

Almost 60% of the students are eligible for free lunch in schools.

#Merging the datasets
merge2 <-  merge(df_donors, df_donations, by = "Donor.ID", all = FALSE)
merge3 <-  merge(merge2, df_projects, by = "Project.ID", all = FALSE)
merge4 <-  merge(merge3, df_schools, by = "School.ID", all = FALSE)
merge4 %>% summarise_all(funs(n_distinct))
##   School.ID Project.ID Donor.ID Donor.City Donor.State Donor.Is.Teacher
## 1     12796      29430   106665       6224          52                2
##   Donor.Zip Donation.ID Donation.Included.Optional.Donation Donation.Amount
## 1       900      180143                                   2           13500
##   Donor.Cart.Sequence Donation.Received.Date Donation.year Donation.month
## 1                4976                 171434             2             12
##   Teacher.ID Teacher.Project.Posted.Sequence Project.Type Project.Title
## 1      21037                             216            1         27981
##   Project.Essay Project.Short.Description Project.Need.Statement
## 1         29383                     28398                  29045
##   Project.Subject.Category.Tree Project.Subject.Subcategory.Tree
## 1                            44                              331
##   Project.Grade.Level.Category Project.Resource.Category Project.Cost
## 1                            4                         7        19159
##   Project.Posted.Date Project.Expiration.Date Project.Current.Status
## 1                 148                     254                      2
##   Project.Fully.Funded.Date cat subcat School.Name School.Metro.Type
## 1                       270   8     28       11403                 5
##   School.Percentage.Free.Lunch School.State School.Zip School.City
## 1                          102           51       6665        3519
##   School.County School.District
## 1          1116            3482

Does type of location affect the choice of donors?

merge4%>% group_by(School.Metro.Type)%>% summarise(count = length(Donor.ID))%>%
ggplot(aes(x = reorder(School.Metro.Type, -count),count, fill = School.Metro.Type))+geom_bar(stat='identity')+
theme_bw()+xlab('School Metro Type')
## `summarise()` ungrouping output (override with `.groups` argument)

Most of the donations are for to schools in urban and suburban areas.

Does location affect the choice of projects and resources?

pro <- merge4 %>% sample_n(10000)%>%unnest_tokens(word, Project.Title)
pro <- pro %>% anti_join(stop_words, by = "word")
pro <- pro %>% filter(!str_detect(word, "^\\d"))

pro1 <- pro %>% group_by(word, School.Metro.Type) %>% summarise(count = length(word))
## `summarise()` regrouping output by 'word' (override with `.groups` argument)
pro2 <- pro %>% group_by(word) %>% summarise(count = length(word)) %>% rename(word.count = count)
## `summarise()` ungrouping output (override with `.groups` argument)
pro1 %>%left_join(pro2, by = "word") %>%arrange(desc(word.count)) %>%head(50) %>% ungroup() %>%
  ggplot(aes(reorder(word, count), count, fill = School.Metro.Type)) +
  geom_bar(stat = 'identity') +
  xlab(NULL) +
  coord_flip() +
 facet_wrap(~ School.Metro.Type) +theme_bw()+
  theme(legend.position = "none")

We see that the schools located in the urban areas require resources for projects that are inclined towards learning, reading, technology, classroom etc. Whereas, the schools located in the rural areas require some of the basic resources such as seating and books at par with learning and technology requirements.


Review 3


RFM Analysis


RFM stands for Recency, Frequency and Monetary. This segmentation technique is used for targeting the most valuable customers based on their recent activities, how often they contribute and the amount spent.
From the above plots, we can see that there a lot of first time donors. Building a recommendation solely based on their activity and frequency for these donors is difficult because we don’t really know their preferences yet. This leads to a cold start problem. Therefore by using RFM to cluster the donor we can eliminate this problem to an extent.
After analysis, I’ll be fitting clustering models to categorize projects and donors to connect them to the right projects.

#Merging donations and projects
merge5 = merge(df_donations, df_projects, by="Project.ID")
head(merge5)
##                         Project.ID                      Donation.ID
## 1 00000ce845c00cbf0686c992fc369df4 84d4bd0c34c8c28f9e0121118c24360f
## 2 00000ce845c00cbf0686c992fc369df4 19351e1d9ae0bccab31b1f6009ad47a3
## 3 00000ce845c00cbf0686c992fc369df4 987eecef69373f0d7ee9238652521fb2
## 4 00000ce845c00cbf0686c992fc369df4 d5364b1bb3b14594808bd6efa7544165
## 5 00000ce845c00cbf0686c992fc369df4 39af862cb04e4f938e5b827236a610a6
## 6 00000ce845c00cbf0686c992fc369df4 c47f78571f62bcf10eee6a46a4a8a85d
##                           Donor.ID Donation.Included.Optional.Donation
## 1 391f14831940fc7bc41df2cd7fb06030                                 Yes
## 2 bd323208dc78b1c74b62664b768f3176                                 Yes
## 3 531ed26f1a505282337e0d805be97281                                 Yes
## 4 6dd6113f89f2766d3b0707ef2a46260c                                 Yes
## 5 8a1875762c85932fff192ea126ccdff2                                 Yes
## 6 a3f070e439d52de72ca62dc41f9b16a4                                 Yes
##   Donation.Amount Donor.Cart.Sequence Donation.Received.Date Donation.year
## 1             100                   1    2013-02-27 09:55:18          2013
## 2             200                   2    2013-02-17 21:36:24          2013
## 3              25                   1    2013-02-27 09:57:57          2013
## 4              10                  44    2013-02-27 10:32:22          2013
## 5              50                   1    2013-02-27 09:07:51          2013
## 6              50                   2    2013-02-27 09:53:12          2013
##   Donation.month                        School.ID
## 1              2 9353b7e95a77a0dd6a67a0d27ac485e3
## 2              2 9353b7e95a77a0dd6a67a0d27ac485e3
## 3              2 9353b7e95a77a0dd6a67a0d27ac485e3
## 4              2 9353b7e95a77a0dd6a67a0d27ac485e3
## 5              2 9353b7e95a77a0dd6a67a0d27ac485e3
## 6              2 9353b7e95a77a0dd6a67a0d27ac485e3
##                         Teacher.ID Teacher.Project.Posted.Sequence Project.Type
## 1 1083172499055a6c9f3bb3013ba7742b                               1  Teacher-Led
## 2 1083172499055a6c9f3bb3013ba7742b                               1  Teacher-Led
## 3 1083172499055a6c9f3bb3013ba7742b                               1  Teacher-Led
## 4 1083172499055a6c9f3bb3013ba7742b                               1  Teacher-Led
## 5 1083172499055a6c9f3bb3013ba7742b                               1  Teacher-Led
## 6 1083172499055a6c9f3bb3013ba7742b                               1  Teacher-Led
##           Project.Title
## 1 Can You Hear Me Now? 
## 2 Can You Hear Me Now? 
## 3 Can You Hear Me Now? 
## 4 Can You Hear Me Now? 
## 5 Can You Hear Me Now? 
## 6 Can You Hear Me Now? 
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           Project.Essay
## 1 Find a way or make one. That's our class motto and that's our attitude towards learning. Help us make a way to achieve greatness! <!--DONOTREMOVEESSAYDIVIDER-->My students attend middle school in a low-income neighborhood in New York that presents a lot of difficulties. Many of my students come to school distracted by housing issues, hunger, and gang-related pressures that you can literally hear outside the school walls. <!--DONOTREMOVEESSAYDIVIDER-->There's no better way to drown the gang-related pressures of our neighborhood than to provide my students with headphones! Donating to this project will help my students get lost in learning with our new computer lab, which will be used to instruct students through an innovative new method called blended learning. Be a part of the future of education! <!--DONOTREMOVEESSAYDIVIDER-->This project is so important because it is so innovative. In an age where video games and television are seemingly the only things that will hold our students' attention, we have an opportunity to teach them using these same tools. Please, be a part of changing the trajectories of my students' lives. 
## 2 Find a way or make one. That's our class motto and that's our attitude towards learning. Help us make a way to achieve greatness! <!--DONOTREMOVEESSAYDIVIDER-->My students attend middle school in a low-income neighborhood in New York that presents a lot of difficulties. Many of my students come to school distracted by housing issues, hunger, and gang-related pressures that you can literally hear outside the school walls. <!--DONOTREMOVEESSAYDIVIDER-->There's no better way to drown the gang-related pressures of our neighborhood than to provide my students with headphones! Donating to this project will help my students get lost in learning with our new computer lab, which will be used to instruct students through an innovative new method called blended learning. Be a part of the future of education! <!--DONOTREMOVEESSAYDIVIDER-->This project is so important because it is so innovative. In an age where video games and television are seemingly the only things that will hold our students' attention, we have an opportunity to teach them using these same tools. Please, be a part of changing the trajectories of my students' lives. 
## 3 Find a way or make one. That's our class motto and that's our attitude towards learning. Help us make a way to achieve greatness! <!--DONOTREMOVEESSAYDIVIDER-->My students attend middle school in a low-income neighborhood in New York that presents a lot of difficulties. Many of my students come to school distracted by housing issues, hunger, and gang-related pressures that you can literally hear outside the school walls. <!--DONOTREMOVEESSAYDIVIDER-->There's no better way to drown the gang-related pressures of our neighborhood than to provide my students with headphones! Donating to this project will help my students get lost in learning with our new computer lab, which will be used to instruct students through an innovative new method called blended learning. Be a part of the future of education! <!--DONOTREMOVEESSAYDIVIDER-->This project is so important because it is so innovative. In an age where video games and television are seemingly the only things that will hold our students' attention, we have an opportunity to teach them using these same tools. Please, be a part of changing the trajectories of my students' lives. 
## 4 Find a way or make one. That's our class motto and that's our attitude towards learning. Help us make a way to achieve greatness! <!--DONOTREMOVEESSAYDIVIDER-->My students attend middle school in a low-income neighborhood in New York that presents a lot of difficulties. Many of my students come to school distracted by housing issues, hunger, and gang-related pressures that you can literally hear outside the school walls. <!--DONOTREMOVEESSAYDIVIDER-->There's no better way to drown the gang-related pressures of our neighborhood than to provide my students with headphones! Donating to this project will help my students get lost in learning with our new computer lab, which will be used to instruct students through an innovative new method called blended learning. Be a part of the future of education! <!--DONOTREMOVEESSAYDIVIDER-->This project is so important because it is so innovative. In an age where video games and television are seemingly the only things that will hold our students' attention, we have an opportunity to teach them using these same tools. Please, be a part of changing the trajectories of my students' lives. 
## 5 Find a way or make one. That's our class motto and that's our attitude towards learning. Help us make a way to achieve greatness! <!--DONOTREMOVEESSAYDIVIDER-->My students attend middle school in a low-income neighborhood in New York that presents a lot of difficulties. Many of my students come to school distracted by housing issues, hunger, and gang-related pressures that you can literally hear outside the school walls. <!--DONOTREMOVEESSAYDIVIDER-->There's no better way to drown the gang-related pressures of our neighborhood than to provide my students with headphones! Donating to this project will help my students get lost in learning with our new computer lab, which will be used to instruct students through an innovative new method called blended learning. Be a part of the future of education! <!--DONOTREMOVEESSAYDIVIDER-->This project is so important because it is so innovative. In an age where video games and television are seemingly the only things that will hold our students' attention, we have an opportunity to teach them using these same tools. Please, be a part of changing the trajectories of my students' lives. 
## 6 Find a way or make one. That's our class motto and that's our attitude towards learning. Help us make a way to achieve greatness! <!--DONOTREMOVEESSAYDIVIDER-->My students attend middle school in a low-income neighborhood in New York that presents a lot of difficulties. Many of my students come to school distracted by housing issues, hunger, and gang-related pressures that you can literally hear outside the school walls. <!--DONOTREMOVEESSAYDIVIDER-->There's no better way to drown the gang-related pressures of our neighborhood than to provide my students with headphones! Donating to this project will help my students get lost in learning with our new computer lab, which will be used to instruct students through an innovative new method called blended learning. Be a part of the future of education! <!--DONOTREMOVEESSAYDIVIDER-->This project is so important because it is so innovative. In an age where video games and television are seemingly the only things that will hold our students' attention, we have an opportunity to teach them using these same tools. Please, be a part of changing the trajectories of my students' lives. 
##                                                                                                                                                                               Project.Short.Description
## 1 Find a way or make one. That's our class motto and that's our attitude towards learning. Help us make a way to achieve greatness! My students attend middle school in a low-income neighborhood in...
## 2 Find a way or make one. That's our class motto and that's our attitude towards learning. Help us make a way to achieve greatness! My students attend middle school in a low-income neighborhood in...
## 3 Find a way or make one. That's our class motto and that's our attitude towards learning. Help us make a way to achieve greatness! My students attend middle school in a low-income neighborhood in...
## 4 Find a way or make one. That's our class motto and that's our attitude towards learning. Help us make a way to achieve greatness! My students attend middle school in a low-income neighborhood in...
## 5 Find a way or make one. That's our class motto and that's our attitude towards learning. Help us make a way to achieve greatness! My students attend middle school in a low-income neighborhood in...
## 6 Find a way or make one. That's our class motto and that's our attitude towards learning. Help us make a way to achieve greatness! My students attend middle school in a low-income neighborhood in...
##                                                                                                                       Project.Need.Statement
## 1 My students need headphones in order to engage in blended learning using our computer lab. Join us in our innovative approach to learning!
## 2 My students need headphones in order to engage in blended learning using our computer lab. Join us in our innovative approach to learning!
## 3 My students need headphones in order to engage in blended learning using our computer lab. Join us in our innovative approach to learning!
## 4 My students need headphones in order to engage in blended learning using our computer lab. Join us in our innovative approach to learning!
## 5 My students need headphones in order to engage in blended learning using our computer lab. Join us in our innovative approach to learning!
## 6 My students need headphones in order to engage in blended learning using our computer lab. Join us in our innovative approach to learning!
##   Project.Subject.Category.Tree Project.Subject.Subcategory.Tree
## 1           Literacy & Language                         Literacy
## 2           Literacy & Language                         Literacy
## 3           Literacy & Language                         Literacy
## 4           Literacy & Language                         Literacy
## 5           Literacy & Language                         Literacy
## 6           Literacy & Language                         Literacy
##   Project.Grade.Level.Category Project.Resource.Category Project.Cost
## 1                   Grades 6-8                Technology       710.07
## 2                   Grades 6-8                Technology       710.07
## 3                   Grades 6-8                Technology       710.07
## 4                   Grades 6-8                Technology       710.07
## 5                   Grades 6-8                Technology       710.07
## 6                   Grades 6-8                Technology       710.07
##   Project.Posted.Date Project.Expiration.Date Project.Current.Status
## 1          2013-02-02              2013-03-01           Fully Funded
## 2          2013-02-02              2013-03-01           Fully Funded
## 3          2013-02-02              2013-03-01           Fully Funded
## 4          2013-02-02              2013-03-01           Fully Funded
## 5          2013-02-02              2013-03-01           Fully Funded
## 6          2013-02-02              2013-03-01           Fully Funded
##   Project.Fully.Funded.Date                 cat   subcat
## 1                2013-02-27 Literacy & Language Literacy
## 2                2013-02-27 Literacy & Language Literacy
## 3                2013-02-27 Literacy & Language Literacy
## 4                2013-02-27 Literacy & Language Literacy
## 5                2013-02-27 Literacy & Language Literacy
## 6                2013-02-27 Literacy & Language Literacy
#Taking sample of the data
set.seed(123)
sample.df = merge5
#Monetary
monetary <- sample.df %>% group_by(Donor.ID) %>% summarise(total.donation = sum(Donation.Amount))
## `summarise()` ungrouping output (override with `.groups` argument)
head(monetary)
## # A tibble: 6 x 2
##   Donor.ID                         total.donation
##   <chr>                                     <dbl>
## 1 0000c14308c4cb9259a4fe51f692c9ef             50
## 2 0000d2c093a6301ef33925c06af2c6d1             20
## 3 00010615b56ff057fa00b5144fe2e4cf             25
## 4 0001c6641864a240eff74544a3596acf            150
## 5 00028d5d75335732b1c46c54c5c847dd             25
## 6 00028e5aea55cf68a3216cd74ecceff1             10
#Frequency
frequency <- sample.df %>% group_by(Donor.ID ,Donation.ID) %>% 
   summarise(freq = length(Donation.ID)) %>% 
   group_by(Donor.ID) %>% summarise(frequency = sum(freq))
## `summarise()` regrouping output by 'Donor.ID' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
head(frequency)
## # A tibble: 6 x 2
##   Donor.ID                         frequency
##   <chr>                                <int>
## 1 0000c14308c4cb9259a4fe51f692c9ef         2
## 2 0000d2c093a6301ef33925c06af2c6d1         1
## 3 00010615b56ff057fa00b5144fe2e4cf         1
## 4 0001c6641864a240eff74544a3596acf         2
## 5 00028d5d75335732b1c46c54c5c847dd         1
## 6 00028e5aea55cf68a3216cd74ecceff1         1
#Recency
#Finding the latest date and adding 1 to received date to get the most recent donation date
sample.df$Donation.Received.Date <- as.Date(sample.df$Donation.Received.Date, "%Y-%m-%d %H:%M:%S")
maximum <- max(sample.df$Donation.Received.Date)
maximum <- maximum +1

#Difference between new and old will give the recent number of days or recency
#of each donor
recency <- sample.df %>% group_by(Donor.ID, Donation.Received.Date)
recency$diff <- maximum - recency$Donation.Received.Date
recency <- recency %>% group_by(Donor.ID) %>% summarise(recency = min(diff))
## `summarise()` ungrouping output (override with `.groups` argument)
head(recency)
## # A tibble: 6 x 2
##   Donor.ID                         recency 
##   <chr>                            <drtn>  
## 1 0000c14308c4cb9259a4fe51f692c9ef 147 days
## 2 0000d2c093a6301ef33925c06af2c6d1 109 days
## 3 00010615b56ff057fa00b5144fe2e4cf 232 days
## 4 0001c6641864a240eff74544a3596acf 152 days
## 5 00028d5d75335732b1c46c54c5c847dd 213 days
## 6 00028e5aea55cf68a3216cd74ecceff1 209 days
#RFM dataframe
RFM = merge(recency, frequency, by="Donor.ID")
RFM = merge(RFM, monetary, by="Donor.ID")
#Converting recency to numeric
RFM$recency = as.numeric(RFM$recency)

Recency vs Frequency

ggplot(RFM, aes(x = recency, y = frequency)) + geom_point(aes(color = recency), size = 3) +
  scale_color_gradientn(colors = c("#00AFBB", "#E7B800", "#FC4E07")) + theme_bw()

We can see that recently visiting donors have donated more frequently

Recency vs Monetary

ggplot(RFM, aes(x = recency, y = total.donation)) + geom_point(aes(color = recency), size = 3) +
  scale_color_gradientn(colors = c("#00AFBB", "#E7B800", "#FC4E07")) + theme_bw()

More recently visiting donors have contributed more

Frequency vs Monetary

ggplot(RFM, aes(x = frequency, y = total.donation)) + geom_point(aes(color = frequency), size = 3) +
  scale_color_gradientn(colors = c("#00AFBB", "#E7B800", "#FC4E07")) + theme_bw()

The above plot shows that high number of donations needn’t lead to high amount of donations.


Clustering Analysis


#Standard scaling the data
Scaled_RFM = scale(RFM[,-1])
head(Scaled_RFM)
##         recency   frequency total.donation
## [1,] -0.6477374  0.02161553    -0.04182617
## [2,] -1.2894395 -0.04786435    -0.08360449
## [3,]  0.7876490 -0.04786435    -0.07664144
## [4,] -0.5633029  0.02161553     0.09743489
## [5,]  0.4667980 -0.04786435    -0.07664144
## [6,]  0.3992504 -0.04786435    -0.09753060

K Means Clustering

Km = kmeans(Scaled_RFM, centers = 4, iter.max = 50)
clusters <- Km$cluster
RFM.clusters = cbind(RFM, clusters)
RFM.clusters$clusters <- as.factor(RFM.clusters$clusters)
ggplot(RFM.clusters,
      aes(x = frequency, y = total.donation, colour = clusters, shape = clusters)) +
  geom_point(position = 'jitter') +
  theme_bw() +
  labs(title = "Total donation made w.r.t frequency", shape = "cluster", colour = "cluster")

ggplot(RFM.clusters,
  aes(x = recency, y = total.donation, colour = clusters, shape = clusters)) +
geom_point() +
theme_bw() +
labs(title = "Total donation made w.r.t recency", shape = "cluster", colour = "cluster")

ggplot(RFM.clusters,
  aes(x = recency, y = frequency, colour = clusters, shape = clusters)) +
geom_point() +
theme_bw() +
labs(title = "Frequency of donation w.r.t recency", shape = "cluster", colour = "cluster")

Checking stability of clusters

RFM.clusters$clusters <- as.numeric(RFM.clusters$clusters)
clust_no <- 4
library(fpc)
## Warning: package 'fpc' was built under R version 4.0.5
set.seed(1234)
evaluate <- clusterboot(RFM.clusters[,-1], 
                      B=100, bootmethod="boot",
                     clustermethod=kmeansCBI,
                    krange=4, seed=20)
## boot 1 
## boot 2 
## boot 3 
## boot 4 
## boot 5 
## boot 6 
## boot 7 
## boot 8 
## boot 9 
## boot 10 
## boot 11 
## boot 12 
## boot 13 
## boot 14 
## boot 15 
## boot 16 
## boot 17 
## boot 18 
## boot 19 
## boot 20 
## boot 21 
## boot 22 
## boot 23 
## boot 24 
## boot 25 
## boot 26 
## boot 27 
## boot 28 
## boot 29 
## boot 30 
## boot 31 
## boot 32 
## boot 33 
## boot 34 
## boot 35 
## boot 36 
## boot 37 
## boot 38 
## boot 39 
## boot 40 
## boot 41 
## boot 42 
## boot 43 
## boot 44 
## boot 45 
## boot 46 
## boot 47 
## boot 48 
## boot 49 
## boot 50 
## boot 51 
## boot 52 
## boot 53 
## boot 54 
## boot 55 
## boot 56 
## boot 57 
## boot 58 
## boot 59 
## boot 60 
## boot 61 
## boot 62 
## boot 63 
## boot 64 
## boot 65 
## boot 66 
## boot 67 
## boot 68 
## boot 69 
## boot 70 
## boot 71 
## boot 72 
## boot 73 
## boot 74 
## boot 75 
## boot 76 
## boot 77 
## boot 78 
## boot 79 
## boot 80 
## boot 81 
## boot 82 
## boot 83 
## boot 84 
## boot 85 
## boot 86 
## boot 87 
## boot 88 
## boot 89 
## boot 90 
## boot 91 
## boot 92 
## boot 93 
## boot 94 
## boot 95 
## boot 96 
## boot 97 
## boot 98 
## boot 99 
## boot 100
bootMean_clus <- data.frame(cluster = 1:4, bootMeans = evaluate$bootmean) 
ggplot(bootMean_clus, aes(cluster, bootMeans)) +
  geom_point(aes(colour = "darkred", size = 1)) +
 geom_hline(yintercept = c(0.6, 0.8)) +
labs(y = "stability", title = "Stability evaluation") +
theme_bw()+ theme(legend.position="none")

Clusters 1,is highly stable with stability factor close to 1. Cluster 2, 3, 4 are moderately stable having a value between 0.6 and 0.8. Overall, the model is stable and hence we can proceed with k=4.

evaluate$bootmean
## [1] 0.9941291 0.6853333 0.7101815 0.7263345

Values closer to 1 indicate high stability.

#Filtering clusters
cluster1 <- RFM.clusters %>% filter(clusters == '1')
cluster2 <- RFM.clusters %>% filter(clusters == '2')
cluster3 <- RFM.clusters %>% filter(clusters == '3')
cluster4 <- RFM.clusters %>% filter(clusters == '4')

Merging the cluster RFM values with rest of the data

cluster1 = merge(cluster1, merge5, by = "Donor.ID", all=FALSE)
cluster2 = merge(cluster2, merge5, by = "Donor.ID", all=FALSE)
cluster3 = merge(cluster3, merge5, by = "Donor.ID", all=FALSE)
cluster4 = merge(cluster4, merge5, by = "Donor.ID", all=FALSE)
cluster1 %>% group_by(cat)%>% filter(!(is.na(cat))) %>%
summarise(top_cat = length(Donation.ID)) %>%mutate(pct = prop.table(top_cat)*100)%>%
    ggplot(aes(x = reorder(cat, -pct), y = pct, fill = cat)) + 
   geom_bar(stat = 'identity') + scale_fill_brewer(palette="Set1")+
    geom_text(aes(label = sprintf("%.1f%%", pct)), hjust = 0.5,
            vjust = -0.5, size =3)+ theme_bw()+  xlab("")  + ylab("Percent")+
              ggtitle("Top Categories in Cluster 1")
## `summarise()` ungrouping output (override with `.groups` argument)

cluster2 %>% group_by(cat)%>% filter(!(is.na(cat))) %>%
summarise(top_cat = length(Donation.ID)) %>%mutate(pct = prop.table(top_cat)*100)%>%
    ggplot(aes(x = reorder(cat, -pct), y = pct, fill = cat)) + 
   geom_bar(stat = 'identity') + scale_fill_brewer(palette="Set1")+
    geom_text(aes(label = sprintf("%.1f%%", pct)), hjust = 0.5,
            vjust = -0.5, size =3)+ theme_bw()+  xlab("")  + ylab("Percent")+
              ggtitle("Top Categories in Cluster 2")
## `summarise()` ungrouping output (override with `.groups` argument)

cluster3 %>% group_by(cat)%>% filter(!(is.na(cat))) %>%
summarise(top_cat = length(Donation.ID)) %>%mutate(pct = prop.table(top_cat)*100)%>%
    ggplot(aes(x = reorder(cat, -pct), y = pct, fill = cat)) + 
   geom_bar(stat = 'identity') + scale_fill_brewer(palette="Set1")+
    geom_text(aes(label = sprintf("%.1f%%", pct)), hjust = 0.5,
            vjust = -0.5, size =3)+ theme_bw()+  xlab("")  + ylab("Percent")+
              ggtitle("Top Categories in Cluster 3")
## `summarise()` ungrouping output (override with `.groups` argument)

cluster4 %>% group_by(cat)%>% filter(!(is.na(cat))) %>%
summarise(top_cat = length(Donation.ID)) %>%mutate(pct = prop.table(top_cat)*100)%>%
    ggplot(aes(x = reorder(cat, -pct), y = pct, fill = cat)) + 
   geom_bar(stat = 'identity') + scale_fill_brewer(palette="Set1")+
    geom_text(aes(label = sprintf("%.1f%%", pct)), hjust = 0.5,
            vjust = -0.5, size =3)+ theme_bw()+  xlab("")  + ylab("Percent")+
              ggtitle("Top Categories in Cluster 4")
## `summarise()` ungrouping output (override with `.groups` argument)

cluster1 %>% group_by(subcat)%>% filter(!(is.na(subcat))) %>%
summarise(top_subcat = length(Donation.ID)) %>%mutate(pct = prop.table(top_subcat)*100)%>%
    ggplot(aes(x = reorder(subcat, pct), y = pct, fill = subcat)) + 
   geom_bar(stat = 'identity') + #scale_fill_brewer(palette="Set1")+
    geom_text(aes(label = sprintf("%.2f%%", pct)), hjust = -0.5,
            vjust = 0.5, size =3)+ theme_bw()+  xlab("")  + ylab("Percent")+coord_flip()+
              ggtitle("Cluster 1 Top Sub-Categories")
## `summarise()` ungrouping output (override with `.groups` argument)

cluster2 %>% group_by(subcat)%>% filter(!(is.na(subcat))) %>%
summarise(top_subcat = length(Donation.ID)) %>%mutate(pct = prop.table(top_subcat)*100)%>%
    ggplot(aes(x = reorder(subcat, pct), y = pct, fill = subcat)) + 
   geom_bar(stat = 'identity') + #scale_fill_brewer(palette="Set1")+
    geom_text(aes(label = sprintf("%.2f%%", pct)), hjust = -0.5,
            vjust = 0.5, size =3)+ theme_bw()+  xlab("")  + ylab("Percent")+coord_flip()+
              ggtitle("Cluster 2 Top Sub-Categories")
## `summarise()` ungrouping output (override with `.groups` argument)

cluster3 %>% group_by(subcat)%>% filter(!(is.na(subcat))) %>%
summarise(top_subcat = length(Donation.ID)) %>%mutate(pct = prop.table(top_subcat)*100)%>%
    ggplot(aes(x = reorder(subcat, pct), y = pct, fill = subcat)) + 
   geom_bar(stat = 'identity') + #scale_fill_brewer(palette="Set1")+
    geom_text(aes(label = sprintf("%.2f%%", pct)), hjust = -0.5,
            vjust = 0.5, size =3)+ theme_bw()+  xlab("")  + ylab("Percent")+coord_flip()+
              ggtitle("Cluster 3 Top Sub-Categories")
## `summarise()` ungrouping output (override with `.groups` argument)

cluster4 %>% group_by(subcat)%>% filter(!(is.na(subcat))) %>%
summarise(top_subcat = length(Donation.ID)) %>%mutate(pct = prop.table(top_subcat)*100)%>%
    ggplot(aes(x = reorder(subcat, pct), y = pct, fill = subcat)) + 
   geom_bar(stat = 'identity') + #scale_fill_brewer(palette="Set1")+
    geom_text(aes(label = sprintf("%.2f%%", pct)), hjust = -0.5,
            vjust = 0.5, size =3)+ theme_bw()+  xlab("")  + ylab("Percent")+coord_flip()+
              ggtitle("Cluster 4 Top Sub-Categories")
## `summarise()` ungrouping output (override with `.groups` argument)


Inferences from the above plots

(Subject to change with each knit)

Cluster 1 Donors in cluster 1 donate very frequently and recently. Even though the major project categories are same for all clusters, Cluster 1 donors prefer music and arts more than the donors from other clusters. The proportion is almost the same as applied learning.
The top 5 recommended subcategories are :-
  • Literacy
  • Language & Writing
  • Music
  • Mathematics
  • Applied Sciences

  • Cluster 2 Cluster 2 donors are not very recent donors and their total donation is mostly concentrated within 1000 dollars. Their frequency of donation varies from 1-60. Unlike cluster 1 donors, these donors prefer scientific projects over music and arts. They also prefer professional (ESL) and early development more than other clusters.
    The top 5 recommended subcategories are :-
  • Literacy
  • Language & Writing
  • Applied Sciences
  • Mathematics
  • Environmental Sciences

  • Cluster 3 The Cluster 3 donors are those who donated to a project not very long ago but are not that recent either. Like cluster 1, they also prefer music but on contrary prefer professional (ESL) and early development more than other clusters. This suggests that these donors prefer overall development of an individual.
    The top 5 recommended subcategories are :-
  • Literacy
  • Language & Writing
  • Music
  • Mathematics
  • Early Development

  • Cluster 4 Cluster 4 donors don’t donate frequently. They donate only once in a long time. Donors in this cluster may prefer Music & Arts over character development. This cluster shows a strong similarity with cluster 2. However, their recency and frequency is in different ranges.
    The top 5 recommended subcategories are :-
  • Literacy
  • Language & Writing
  • Applied Sciences
  • Mathematics
  • Environmental Sciences


  • Top 10 projects (Project ID) in each cluster based on the number of donations

    Although, high number donations needn’t lead to high amount.


    #### Cluster 1

    cluster1 %>% group_by(Project.ID)%>%summarize(no.of.donors=n())%>% 
      arrange(desc(no.of.donors)) %>% 
      top_n(10) 
    ## `summarise()` ungrouping output (override with `.groups` argument)
    ## Selecting by no.of.donors
    ## # A tibble: 10 x 2
    ##    Project.ID                       no.of.donors
    ##    <chr>                                   <int>
    ##  1 8e876a18af7e2df97f74648f7aea6dde           79
    ##  2 2a84cb3c1439994b368f4b51dc099678           72
    ##  3 606cfcd8c686599b99c6dd75697bdc77           58
    ##  4 29c241a7b2254962cfaae49da58ad948           57
    ##  5 8c88fe4f090a656861616dfed49ec116           55
    ##  6 c1afe2eed335a024a34cfb82f281f08a           52
    ##  7 09a6c55b4b7b8e20a056c9953bf78d18           51
    ##  8 1993030c8f00a03a700ebb46d8ca66fb           46
    ##  9 3cef9c6b38eb98b50ab9d82616a95fef           44
    ## 10 817e050cf3ce2c72ccb3cd590da33dc3           41

    Cluster 2

    cluster2 %>% group_by(Project.ID)%>%summarize(no.of.donors=n())%>% 
      arrange(desc(no.of.donors)) %>% 
      top_n(10) 
    ## `summarise()` ungrouping output (override with `.groups` argument)
    ## Selecting by no.of.donors
    ## # A tibble: 10 x 2
    ##    Project.ID                       no.of.donors
    ##    <chr>                                   <int>
    ##  1 070d39ec9af57d2c05377ae36f73195f           87
    ##  2 39f37e7b3307b512743899a42a42f535           66
    ##  3 7cd8c5a63c9196b0a5ed83007d0f6919           64
    ##  4 936042944d387906f01b48a3363decfa           61
    ##  5 074e5293502c4b8ca6488c7a9ea7a8a3           59
    ##  6 8c88fe4f090a656861616dfed49ec116           57
    ##  7 e6cdda4fd56550934e9516ed06811976           56
    ##  8 fc9e4a1d4dff629ccd0a2077d9b33ab8           55
    ##  9 dc679201bc026f33a4ac5041f19e8963           54
    ## 10 ca58e3e932e3b5ee31cd377ee727c215           53

    Cluster 3

    cluster3 %>% group_by(Project.ID)%>%summarize(no.of.donors=n())%>% 
      arrange(desc(no.of.donors)) %>% 
      top_n(10) 
    ## `summarise()` ungrouping output (override with `.groups` argument)
    ## Selecting by no.of.donors
    ## # A tibble: 11 x 2
    ##    Project.ID                       no.of.donors
    ##    <chr>                                   <int>
    ##  1 4fb102f31260c90c7a0551c0915a82f4          202
    ##  2 cd40fecbe9d077f881792af676b854d2          148
    ##  3 e948970dafb802e486b76e4de2a0dc06          147
    ##  4 f53f237da78274c61dc7036d1a340161          146
    ##  5 144db90dfae1670c6d1cb915fc238065          135
    ##  6 0f746a652a5d3384dbdac242b2d2b488          128
    ##  7 8e83c2e59073f18b20a1ed091eb2283e          124
    ##  8 271dc56a78b787cc127105b0d875f084          122
    ##  9 7a72af189ab55f9a46c228be253e8e8d          118
    ## 10 0941d08ea4f7484bfad65245f4ab372f          114
    ## 11 9d4f963f235283008b2990085e8d990f          114

    Cluster 4

    cluster4 %>% group_by(Project.ID)%>%summarize(no.of.donors=n())%>% 
      arrange(desc(no.of.donors)) %>% 
      top_n(10) 
    ## `summarise()` ungrouping output (override with `.groups` argument)
    ## Selecting by no.of.donors
    ## # A tibble: 10 x 2
    ##    Project.ID                       no.of.donors
    ##    <chr>                                   <int>
    ##  1 36eda229d6f38a9ad0900985637c7cc4           74
    ##  2 4eafa5d4906d6795dc943d529abc9148           59
    ##  3 e476ca17a08780835fa6c990627ac9b2           52
    ##  4 12066a7b6052a180d0b8fac81d56f23b           49
    ##  5 8643219d13613654c49e708c1e5b8920           39
    ##  6 93b20458abba61fe6b7f912fa8910aab           39
    ##  7 db50734f945b37f9a2e94e3c4ab106ee           39
    ##  8 7f145e205d9cd25a162aa6fb4d04be2d           34
    ##  9 9c8c7f83f149ed02c7d11f9f53e0a63b           34
    ## 10 8da1fca0b119f38d1c718aa74b6c8366           33